"""
Created on 2017-08-02

@author: xiaoj
"""
import re
import urllib
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser:
    """Extract follow-up links and title/summary data from an HTML page."""

    def parse(self, text, new_url):
        """Parse an HTML document.

        Args:
            text: HTML markup of the page.
            new_url: URL the page was fetched from; used to resolve relative
                links and recorded in the returned data.

        Returns:
            A ``(new_urls, new_data)`` tuple: the list of absolute URLs found
            by ``_get_new_url`` and the dict (or ``None``) produced by
            ``_get_new_data``.
        """
        soup = BeautifulSoup(text, "html.parser")
        return self._get_new_url(soup, new_url), self._get_new_data(soup, new_url)

    def _get_new_url(self, soup, new_url):
        """Return absolute URLs for every ``<a>`` whose href contains "item".

        Relative hrefs are resolved against ``new_url``. Document order is
        kept and duplicates are not removed.
        """
        links = soup.find_all("a", href=re.compile("item"))
        return [urllib.parse.urljoin(new_url, link.get("href")) for link in links]

    def _get_new_data(self, soup, new_url):
        """Return the page's title, summary and URL as a dict.

        Returns ``None`` when the page lacks the summary ``<div>`` or the
        title heading, so callers get one consistent "nothing to record"
        signal instead of an ``AttributeError`` on unexpected page layouts.
        """
        summary = soup.find("div", class_="lemma-summary")
        if summary is None:
            return None

        # Look the title up step by step: either lookup may come back empty
        # on pages that do not follow the expected layout.
        title_box = soup.find("dd", class_="lemmaWgt-lemmaTitle-title")
        heading = title_box.find("h1") if title_box is not None else None
        if heading is None:
            return None

        return {
            "title": heading.get_text(),
            # Newlines are stripped so the summary is stored as a single line.
            "summary": summary.get_text().replace("\n", ""),
            "url": new_url,
        }
